Packages official documentation
Cheatsheets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Synthetic dataset shared by the examples below:
# an ordered x axis, two Gaussian series and two integer-coded categorical columns
n_points = 1000
x = range(n_points)
y = np.random.randn(n_points)
z = np.random.randn(n_points)
cat = np.random.randint(low=0, high=3, size=n_points)      # codes 0..2
colors = np.random.randint(low=0, high=5, size=n_points)   # codes 0..4
df = pd.DataFrame(dict(x=x, y=y, colors=colors, cat=cat))
Matplotlib
# From matplotlib.pyplot directly
plt.plot(y)
[<matplotlib.lines.Line2D at 0x13b7c3130>]
# From Pandas DataFrames
df.plot('x', 'y')
<AxesSubplot:xlabel='x'>
Seaborn
# Activate Seaborn nice theme (will also be activated for standard matplotlib graphs)
sns.set_theme()
sns.lineplot(data=df, x='x', y='y')
<AxesSubplot:xlabel='x', ylabel='y'>
Plotly
# Interactive figure: zoom and hover with the mouse to inspect the data values
px.line(df, x='x', y='y')
Matplotlib
# From matplotlib.pyplot directly
plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x14a9006a0>
# From Pandas DataFrames
df.plot.scatter('x', 'y')
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
<AxesSubplot:xlabel='x', ylabel='y'>
Seaborn
sns.scatterplot(data=df, x='x', y='y')
<AxesSubplot:xlabel='x', ylabel='y'>
Plotly
px.scatter(df, x='x', y='y')
Matplotlib
# From matplotlib.pyplot directly
plt.hist(y)
(array([ 4., 16., 60., 146., 243., 274., 164., 71., 16., 6.]),
array([-3.38910972, -2.72965353, -2.07019735, -1.41074116, -0.75128498,
-0.09182879, 0.56762739, 1.22708358, 1.88653976, 2.54599595,
3.20545213]),
<BarContainer object of 10 artists>)
# From Pandas DataFrames
df['y'].hist()
<AxesSubplot:>
Seaborn
sns.histplot(df['y'])
<AxesSubplot:xlabel='y', ylabel='Count'>
Plotly
px.histogram(df['y'])
# Data: two ways of assigning one colour per point
import random
# First 500 points blue, last 500 points red
colors_1 = ['b' if i < 500 else 'r' for i in range(1000)]
# Blue or red drawn at random for each point
colors_2 = random.choices(('b', 'r'), k=1000)
# From matplotlib.pyplot directly
plt.scatter(x, y, c=colors_1)
<matplotlib.collections.PathCollection at 0x14aac1040>
# From matplotlib.pyplot directly
plt.scatter(x, y, c=colors_2)
<matplotlib.collections.PathCollection at 0x14acad1c0>
Comment est-ce que cela fonctionne ?
# Simple example: one colour per point, matched by position
x_small = [0, 1, 2, 3, 4]
y_small = [9, 3, 4, 5, 8]
# Named like x_small / y_small so that the module-level `colors` array
# (used to build df) is not overwritten by this 5-element list
colors_small = ['b', 'b', 'b', 'r', 'b']
plt.scatter(x_small, y_small, c=colors_small)
<matplotlib.collections.PathCollection at 0x14ad3d730>
# Data: one marker size per point
# First 500 points small (10), last 500 points large (40)
sizes_1 = [10 if i < 500 else 40 for i in range(1000)]
# Small or large drawn at random for each point
sizes_2 = random.choices((10, 40), k=1000)
# From matplotlib.pyplot directly
plt.scatter(x, y, c=colors_1, s=sizes_1)
<matplotlib.collections.PathCollection at 0x14ad99820>
# From matplotlib.pyplot directly
plt.scatter(x, y, c=colors_2, s=sizes_2)
<matplotlib.collections.PathCollection at 0x14adf9100>
Comment est-ce que cela fonctionne ?
# From matplotlib.pyplot directly
plt.scatter(x, y, c=colors_1, marker='+')
<matplotlib.collections.PathCollection at 0x14ae6cf70>
# Solution 2: one scatter call per half of the data, so that each half gets its own marker
plt.scatter(x[500:], y[500:], c=colors_1[500:], marker='+')
plt.scatter(x[:500], y[:500], c=colors_1[:500], marker='o')
<matplotlib.collections.PathCollection at 0x14aed5b80>
sns.pairplot(df, hue='cat', height=2.5)
<seaborn.axisgrid.PairGrid at 0x14ad14460>
g = sns.jointplot(x="x", y="y", data=df, kind="reg")
import plotly.graph_objs as go
# Count the rows of each (cat, colors) pair.
# The count column is named explicitly: a bare reset_index() calls it 0 on
# older pandas and 'count' on pandas 2.x, which breaks a hard-coded `values=0`.
parent_data = df[['colors', 'cat']].groupby('cat').value_counts()
parent_data = parent_data.reset_index(name='count')
# Replace the integer codes by readable labels for the chart
color_map = {0:'red', 1:'white', 2:'blue', 3:'green', 4:'pink'}
cat_map = {0:'cat_0', 1:'cat_1', 2:'cat_2'}
parent_data['colors'] = parent_data['colors'].replace(color_map)
parent_data['cat'] = parent_data['cat'].replace(cat_map)
# Inner ring: cat; outer ring: colors; sector size: number of rows
fig = px.sunburst(parent_data, path=['cat', 'colors'], values='count')
fig.show()
Aller plus loin:
import missingno as msno
# Work on a float copy of df: NaN cannot be stored in integer columns, and
# recent pandas versions warn about the implicit int -> float upcast on assignment
df_with_na = df.astype('float64')
# Random: rows where x goes missing, drawn independently of the other columns
idx_to_add_na_x = np.random.randint(df.shape[0], size=100)
# Correlated: cat goes missing only on rows where colors is missing too
idx_to_add_na_colors = np.random.randint(df.shape[0], size=60)
idx_to_add_na_cat = np.random.choice(idx_to_add_na_colors, size=40)
# Negatively correlated: y goes missing only on rows where x is kept
remaining_idx_choices = df_with_na.index.drop(idx_to_add_na_x)
idx_to_add_na_y = np.random.choice(remaining_idx_choices, size=100)
# Add nans
df_with_na.loc[idx_to_add_na_y, 'y'] = np.nan
df_with_na.loc[idx_to_add_na_x, 'x'] = np.nan
df_with_na.loc[idx_to_add_na_colors, 'colors'] = np.nan
df_with_na.loc[idx_to_add_na_cat, 'cat'] = np.nan
df_with_na
| x | y | colors | cat | |
|---|---|---|---|---|
| 0 | 0.0 | 0.191557 | 1.0 | 0.0 |
| 1 | NaN | 2.708459 | 3.0 | 2.0 |
| 2 | 2.0 | NaN | 0.0 | 2.0 |
| 3 | 3.0 | 1.798122 | 3.0 | 1.0 |
| 4 | NaN | -1.898219 | 3.0 | 2.0 |
| ... | ... | ... | ... | ... |
| 995 | 995.0 | -1.886973 | 1.0 | 2.0 |
| 996 | 996.0 | -0.281315 | 4.0 | 0.0 |
| 997 | 997.0 | 0.764996 | 2.0 | 2.0 |
| 998 | 998.0 | NaN | 0.0 | 2.0 |
| 999 | 999.0 | 0.436428 | 1.0 | 0.0 |
1000 rows × 4 columns
msno.matrix(df_with_na)
<AxesSubplot:>
msno.bar(df_with_na)
<AxesSubplot:>
msno.heatmap(df_with_na)
<AxesSubplot:>
msno.dendrogram(df_with_na)
<AxesSubplot:>
Cette approche permet davantage de flexibilité, bien qu'elle soit moins facile à mettre en place
Création de la figure, puis des axes
fig = plt.figure(figsize=(8, 3))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df['x'], df['y']);
Création de la figure et des axes en une commande
fig, axes = plt.subplots(1, 1, figsize=(8, 3))
axes.scatter(df['x'], df['y']);
Création de la figure, puis ajout d'une trace
import plotly.graph_objects as go
# Start from an empty figure, then attach the scatter trace to it
fig = go.Figure()
points_trace = go.Scatter(x=df['x'], y=df['y'], mode="markers+text", name="Markers and Text")
fig.add_trace(points_trace)
Attaquons-nous directement aux grilles de graphs ; cela permet également de comprendre en détail comment se crée un graph plus simple :
Création basique d'une figure avec plusieurs graphs "subplots"
fig, axes = plt.subplots(2, 3)
Création basique d'une figure avec plusieurs graphs et une taille spécifique "figsize"
fig, axes = plt.subplots(2, 3, figsize=(10, 7))
Ajout d'un premier graph (en haut à gauche)
fig, axes = plt.subplots(2, 3, figsize=(10, 7))
axes[0, 0].scatter(x, y)
<matplotlib.collections.PathCollection at 0x164b5a4f0>
Ajout de plusieurs graphs
fig, axes = plt.subplots(2, 3, figsize=(10, 7))
# Plots on first row
axes[0, 0].scatter(x, y)
axes[0, 1].scatter(x, y, c='red')
axes[0, 2].scatter(x, y, c='green')
# Plots on second row
axes[1, 0].scatter(x, y, marker='+')
axes[1, 1].scatter(x, y, marker='+', c='red')
axes[1, 2].scatter(x, y, marker='+', c='green')
<matplotlib.collections.PathCollection at 0x164a94610>
Ajout d'un titre, et de labels
# 2 x 3 grid of subplots inside a single figure; axes is indexed as axes[row, column]
fig, axes = plt.subplots(2, 3, figsize=(10, 7))
# Plots on first row (default marker)
axes[0, 0].scatter(x, y)
axes[0, 1].scatter(x, y, c='red')
axes[0, 2].scatter(x, y, c='green')
# Plots on second row ('+' marker)
axes[1, 0].scatter(x, y, marker='+')
axes[1, 1].scatter(x, y, marker='+', c='red')
axes[1, 2].scatter(x, y, marker='+', c='green')
# Title set on the figure, not on an individual subplot
fig.suptitle('Multiple graphs')
# Label 1 (for top left plot)
axes[0, 0].set_xlabel('occurences')
axes[0, 0].set_ylabel('location')
# Label 2 (top middle plot)
axes[0, 1].set_xlabel('occurences')
axes[0, 1].set_ylabel('location')
# Label 3 (top right plot)
axes[0, 2].set_xlabel('occurences')
axes[0, 2].set_ylabel('location')
# Label 4 (bottom left plot)
axes[1, 0].set_xlabel('occurences')
axes[1, 0].set_ylabel('location')
# Label 5 (bottom middle plot)
axes[1, 1].set_xlabel('occurences')
axes[1, 1].set_ylabel('location')
# Label 6 (bottom right plot)
axes[1, 2].set_xlabel('occurences')
axes[1, 2].set_ylabel('location')
Text(0, 0.5, 'location')
fig, axes = plt.subplots(2, 3, figsize=(10, 7))
# One row per marker style and one column per colour (None keeps matplotlib's default)
for row, marker in enumerate((None, '+')):
    for col, color in enumerate((None, 'red', 'green')):
        axes[row, col].scatter(x, y, marker=marker, c=color)
# Title
fig.suptitle('Multiple graphs')
# Same axis labels on each of the six subplots
for ax in axes.flat:
    ax.set_xlabel('occurences')
    ax.set_ylabel('location')
fig.tight_layout()
Partage d'axe entre les graphs: "sharex", "sharey"
fig, axes = plt.subplots(2, 3, figsize=(10, 7), sharey='row')
# One row per marker style and one column per colour (None keeps matplotlib's default)
for row, marker in enumerate((None, '+')):
    for col, color in enumerate((None, 'red', 'green')):
        axes[row, col].scatter(x, y, marker=marker, c=color)
# Title
fig.suptitle('Multiple graphs')
# Every subplot gets an x label
for ax in axes.flat:
    ax.set_xlabel('occurences')
# First row: y label on the left-most plot only
axes[0, 0].set_ylabel('location')
# Cropping the y axis of this plot also applies to the other plots of the row (sharey='row')
axes[0, 0].set_ylim(0, 3)
# Second row: y label on each plot
for ax in axes[1]:
    ax.set_ylabel('location')
fig.tight_layout()
Une approche consiste à utiliser les subplots de Matplotlib et à y insérer des graphs construits avec Seaborn
fig, axes = plt.subplots(2, 3)
sns.scatterplot(ax=axes[0, 0], x=x, y=y)
<AxesSubplot:>
Avec l'utilisation d'une DataFrame plutôt que des Numpy arrays :
fig, axes = plt.subplots(2, 3)
sns.scatterplot(ax=axes[0, 0], data=df, x='x', y='y')
<AxesSubplot:xlabel='x', ylabel='y'>
Une approche plus directe cependant repose sur l'utilisation des grilles Faceted de Seaborn
Il est possible d'utiliser ces grilles Faceted sans même les nommer
# Rappelez-vous des colonnes de notre DataFrame
df.head(3)
| x | y | colors | cat | |
|---|---|---|---|---|
| 0 | 0 | 0.191557 | 1 | 0 |
| 1 | 1 | 2.708459 | 3 | 2 |
| 2 | 2 | 0.353364 | 0 | 2 |
sns.relplot(data=df, x='x', y='y', hue='colors', col='cat')
<seaborn.axisgrid.FacetGrid at 0x1654dba60>
Il est aussi possible d'obtenir un graph pour chaque paire de variables avec pairplot
sns.pairplot(df, hue='cat', height=2.5)
<seaborn.axisgrid.PairGrid at 0x1669a7d60>
Pour davantage de contrôle, il est possible d'utiliser directement les classes PairGrid et FacetGrid.
Il s'agit alors de décomposer la construction en plusieurs étapes, comme avec Matplotlib :
# For pairs of variables: histograms on the diagonal, scatter plots everywhere else
g = sns.PairGrid(df, hue="colors")
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()
<seaborn.axisgrid.PairGrid at 0x16523d5b0>
# Separate control of the diagonal, the upper part and the lower part of the grid
g = sns.PairGrid(df, hue='cat')
g.map_upper(sns.scatterplot)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw=3, legend=False)
<seaborn.axisgrid.PairGrid at 0x166dc1b80>
# Conditional display of a single variable: one histogram of "cat" per value of "colors"
g = sns.FacetGrid(df, col="colors")
g.map(sns.histplot, "cat")
<seaborn.axisgrid.FacetGrid at 0x166d789a0>
Pour davantage d'exemples et d'explications: https://seaborn.pydata.org/tutorial/axis_grids.html
# Download wav file
!wget -q "https://bigsoundbank.com/UPLOAD/wav/0003.wav" -O "data/audio.wav"
# Load audio file
import wave
# The context manager closes the file handle as soon as the metadata and
# the raw frames have been read
with wave.open('data/audio.wav', 'rb') as obj:
    print('Parameters:', obj.getparams())
    sample_freq = obj.getframerate()
    n_samples = obj.getnframes()
    signal_wave = obj.readframes(-1)
# Interpret the raw bytes as 16-bit integer samples
signal_array = np.frombuffer(signal_wave, dtype=np.int16)
# Compute duration and time ticks
duration = n_samples/sample_freq
time = np.linspace(0, duration, num=n_samples)
Parameters: _wave_params(nchannels=1, sampwidth=2, framerate=44100, nframes=559930, comptype='NONE', compname='not compressed')
Matplotlib
# Waveform of the audio signal over time
plt.figure(figsize=(15, 5))
plt.plot(time, signal_array)
plt.title('Audio Plot')
plt.ylabel('Signal wave')
plt.xlabel('Time (s)')
# time is increasing, so its last tick is its maximum; indexing avoids a
# Python-level scan of every sample by the builtin max()
plt.xlim(0, time[-1])
plt.show()
Seaborn
Plotly
fig = go.Figure([go.Scatter(x=time, y=signal_array)])
fig.show()
plt.figure(figsize=(15, 5))
plt.specgram(signal_array, Fs=sample_freq, vmin=-30, vmax=50)
plt.title('Left Channel')
plt.ylabel('Frequency (Hz)')
plt.xlabel('Time (s)')
plt.colorbar()
plt.show()
# Download mri image
!wget -q "https://physionet.org/files/images/1.0.0/E1154S7I000.png?download" -O "data/img.png"
# Load image
import matplotlib.image as mpimg
img = mpimg.imread('data/img.png')
Matplotlib
# Plot image
plt.figure(figsize=(6,6))
# Hide the grid explicitly: visible=None only toggles the current state, so the
# result would depend on whether a theme (e.g. sns.set_theme()) enabled it before
plt.grid(False)
plt.imshow(img)
<matplotlib.image.AxesImage at 0x16c189190>
Seaborn
import seaborn_image as isns
fig, ax = plt.subplots(figsize=(6,6))
isns.imgplot(img, ax=ax)
<AxesSubplot:>
Plotly
import plotly.express as px
fig = px.imshow(img)
fig.show()
fig = plt.figure(figsize=(6,6))
ax = plt.axes(projection='3d')
def f(x, y):
    """Return sin(sqrt(x**2 + y**2)), evaluated element-wise on scalars or arrays."""
    return np.sin(np.sqrt(x ** 2 + y ** 2))


# NOTE: this rebinds the module-level names x and y to 30-point grids axes
x = np.linspace(-6, 6, 30)
y = np.linspace(-6, 6, 30)
# 30 x 30 coordinate matrices and the surface height at each grid node
X, Y = np.meshgrid(x, y)
Z = f(X, Y)
fig = plt.figure(figsize=(6, 6))
ax = plt.axes(projection='3d')
ax.contour3D(X, Y, Z, 50, cmap='binary')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
Text(0.5, 0, 'z')
ax.view_init(60, 35)
fig
import plotly.express as px
df_iris = px.data.iris()
fig = px.scatter_3d(df_iris, x='sepal_length', y='sepal_width', z='petal_width',
color='species')
fig.show()
Pour davantage de possibilités
df['y'].hist(bins=30)
plt.vlines([-2, 2], ymin=0, ymax=80, colors='r')
<matplotlib.collections.LineCollection at 0x16c5bce80>
df.plot.scatter('x', 'y')
plt.hlines([-2, 2], xmin=0, xmax=1000, colors='r')
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
<matplotlib.collections.LineCollection at 0x16c5b9e20>
df.plot.scatter('x', 'y')
bbox_props = {
'boxstyle': 'circle',
'pad': 2,
'facecolor': 'none',
'edgecolor': 'r',
}
plt.annotate("anomaly", (0, 3), bbox = bbox_props, clip_on=True)
plt.show()
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
df.plot.scatter('x', 'y')
# plt.arrow arguments: start position of the arrow (x, y), then offset relative to that start point (dx, dy)
plt.arrow(600, 5, -280, -2.25, head_width=.3, head_length=25, linewidth=.5, color='r')
plt.text(610, 4.9, "anomaly", color='r', fontsize=15)
plt.show()
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
df.plot.scatter('x', 'y')
plt.annotate(text='', xy=(133,-2.83), xytext=(410,-2.83), arrowprops=dict(arrowstyle='<->', color='r'))
plt.text(170, -3.3, "||AB||=277", color='r', fontsize=13)
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
Text(170, -3.3, '||AB||=277')
df.plot.scatter('x', 'y')
# xy: position of the arrow tip; xytext: offset of the label from that point (textcoords='offset points')
plt.annotate(
r'$>1.96\sigma$', xy=(100, 2.7), xytext=(100,40),
textcoords='offset points', ha='center', va='bottom',color='k',
bbox=dict(boxstyle='round,pad=0.2', fc='grey', alpha=0.35),
arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.2',
color='r'))
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
Text(100, 40, '$>1.96\\sigma$')
import plotly.graph_objects as go
fig = go.Figure()
# First trace: the data points of df
fig.add_trace(go.Scatter(
    x=df['x'],
    y=df['y'],
    mode="markers+text",
    name="Markers and Text",
))
# Second trace: horizontal segment at y=3 with a text label at each end point
fig.add_trace(go.Scatter(
    x=[0, 400],
    y=[3, 3],
    mode="lines+text",
    name="Lines and Text",
    text=["Text G", "Text H"],  # one label per point (a third label had no point to attach to)
    textposition="bottom center"
))
df['y'].hist()
plt.savefig('hist_y.pdf')
plt.savefig('hist_y.png')
from matplotlib.backends.backend_pdf import PdfPages
# Write several figures into a single PDF file, one figure per page
with PdfPages('graph_list.pdf') as pdf:
    # Page 1: scatter plot on its own figure
    fig, ax = plt.subplots()
    ax.scatter(df['x'], df['y'])
    pdf.savefig(fig)
    # Page 2: histogram on a fresh figure, so that it is not drawn over the scatter plot
    fig, ax = plt.subplots()
    df['y'].hist(ax=ax)
    pdf.savefig(fig)
from pandas_profiling import ProfileReport
profile = ProfileReport(df_with_na, title="Random data")
profile.to_notebook_iframe()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
profile.to_file("random_data.html")
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]